# Install fastDummies only when it is missing, so that repeated runs neither
# reinstall the package nor need network access.
if (!requireNamespace("fastDummies", quietly = TRUE)) {
  install.packages('fastDummies')
}
library(fastDummies)
library(tidyverse)  # ggplot(), %>%, mutate(), and friends 
library(broom)
library(MatchIt)  # Match things
library(Rcpp)
library(MASS)
library(modelsummary)
library(IRdisplay)
library(haven)
library(dplyr)

# Show the working directory; the data file below is read relative to it.
getwd()

# Load the source data set from its serialized RDS file.
nibrs_all <- readRDS(file = 'nibrs91_20_with_delay.rds')

# Count the rows sharing each unique_incident_id and use that count as the
# incident's victim total, because total_victim_segments also includes other
# victims that were not murdered/killed.
df <- nibrs_all %>% add_count(unique_incident_id)

# A negative diff means the row count exceeds the reported victim segments.
df$diff <- df$total_victim_segments - df$n

# Cap the count at total_victim_segments for exactly those rows. The
# replacement is subset with the same mask so every row receives its own
# value; assigning the whole column into a subset would instead take values
# from the top of the column. Rows with a missing diff keep their count.
over_counted <- !is.na(df$diff) & df$diff < 0
df$n[over_counted] <- df$total_victim_segments[over_counted]

df <- dplyr::rename(df, total_victims = n)

# select only solved
# which() drops rows whose Solved value is NA; a bare logical index would turn
# each of them into a row filled entirely with NA.
df <- df[which(df$Solved == 1), ]
# Days from the incident to the exceptional clearance.
# Both date columns are first converted to Date using the "%Y-%m-%d" format.
df[["exceptional_clearance_date"]] <- as.Date(
  df[["exceptional_clearance_date"]],
  format = "%Y-%m-%d"
)
df[["incident_date.x"]] <- as.Date(df[["incident_date.x"]], format = "%Y-%m-%d")
df[["diff_in_days_ex"]] <- difftime(
  time1 = df[["exceptional_clearance_date"]],
  time2 = df[["incident_date.x"]],
  units = "days"
)
# Frequency of each interval, printed for inspection.
count(df, diff_in_days_ex)
# Days from the incident to the arrest, for all other cases.
df[["arrest_date"]] <- as.Date(df[["arrest_date"]], format = "%Y-%m-%d")
df[["diff_in_days"]] <- difftime(
  time1 = df[["arrest_date"]],
  time2 = df[["incident_date.x"]],
  units = "days"
)
# Frequency of each interval, printed for inspection.
count(df, diff_in_days)


# Merge both intervals into one column: use the exceptional-clearance interval
# when it is present, otherwise fall back to the arrest interval.
# ifelse() drops the difftime class, so the result is a plain number of days.
df$days_to_arrest <- with(
  df,
  ifelse(!is.na(diff_in_days_ex), diff_in_days_ex, diff_in_days)
)
count(df, days_to_arrest)


# Number of rows for which no interval could be computed.
sum(is.na(df$days_to_arrest))

# Keep only rows with a known, non-negative interval (a negative value means
# the arrest is dated before the crime). which() skips comparisons that
# evaluate to NA, so missing intervals are dropped as well.
df <- df[which(df$days_to_arrest >= 0), ]
# Restrict the data to the columns used from here on, in this order.
analysis_columns <- c(
  'year.x', 'state.x', 'city_name', 'population_group', 'state_abbreviation',
  'incident_date.x', 'age_of_victim', 'sex_of_victim', 'race_of_victim',
  'resident_status_of_victim', 'agg_assault_homicide_circumsta1',
  'ucr_offense_code', 'location_type', 'type_weapon_force_involved_1',
  'agency_indicator', 'incident_date_hour', 'country_division',
  'country_region', 'total_victim_segments', 'total_victims',
  'total_offender_segments', 'relation_of_vict_to_offender1', 'days_to_arrest'
)
df <- dplyr::select(df, all_of(analysis_columns))

# Title-case the state and city names: first letter of each word upper-case,
# remaining letters lower-case.
name_columns <- c('state.x', 'city_name')
df[name_columns] <- lapply(df[name_columns], str_to_title)

# get month
library(lubridate)
# Keep incident_date as a Date instead of POSIXlt: POSIXlt is a list-based
# type that is awkward to store in a data frame column, and incident_date.x
# has already been converted to Date above, so no information is lost.
df$incident_date <- as.Date(df$incident_date.x, format = "%Y-%m-%d")
# Full (unabbreviated) month name of the incident.
df$month <- month(df$incident_date, label = TRUE, abbr = FALSE)

### homicide overlap variable
# Build "_"-separated keys from state, city, (agency,) year and month.
df$year2 <- df$year.x
df$Code_Overlap <- with(
  df,
  paste(state.x, city_name, year2, month, sep="_")
)
df$Code_Overlap_Agency <- with(
  df,
  paste(state.x, city_name, agency_indicator, year2, month, sep="_")
)
# 1 when an earlier row already carries the same agency key, 0 otherwise.
df$Monthly_Agency_Overlap <- as.numeric(duplicated(df$Code_Overlap_Agency))
# check number of overlapping homicides
table(df$Monthly_Agency_Overlap)


# decade function
# Compare years as numbers. Comparing against quoted years relies on
# lexicographic string ordering and yields NA for factor columns.
# Years outside 1990-2020, or missing, are labelled "undetermined".
incident_year <- as.numeric(as.character(df$year.x))
df$decade <- case_when(
  incident_year >= 1990 & incident_year < 2000 ~ "90s",
  incident_year >= 2000 & incident_year < 2010 ~ "00s",
  incident_year >= 2010 & incident_year <= 2020 ~ "10s",
  TRUE ~ "undetermined"
)

# FiveY function
# Five-year period of the incident (the last period also includes 2020).
# Years are compared as numbers; comparing against quoted years relies on
# lexicographic string ordering and yields NA for factor columns.
# Years outside 1990-2020, or missing, are labelled "undetermined".
incident_year <- as.numeric(as.character(df$year.x))
df$FiveY <- case_when(
  incident_year >= 1990 & incident_year < 1995 ~ "1990-1994",
  incident_year >= 1995 & incident_year < 2000 ~ "1995-1999",
  incident_year >= 2000 & incident_year < 2005 ~ "2000-2004",
  incident_year >= 2005 & incident_year < 2010 ~ "2005-2009",
  incident_year >= 2010 & incident_year < 2015 ~ "2010-2014",
  incident_year >= 2015 & incident_year <= 2020 ~ "2015-2020",
  TRUE ~ "undetermined"
)

# age function
# Replace the textual age categories with numeric codes: ages given in days
# or hours become 0, "over 98 years old" becomes 99 and "unknown" becomes 999.
# A named vector makes str_replace_all() apply the pairs one after another.
age_recodes <- c(
  '1-6 days old' = '0',
  'over 98 years old' = '99',
  'under 24 hours \\(neonate\\)' = '0',
  '7-364 days old' = '0',
  'unknown' = '999'
)
df$age_of_victim <- str_replace_all(df$age_of_victim, age_recodes)

# Numeric copy of the recoded age.
df$age_of_victim2 <- as.numeric(df$age_of_victim)

table(df$age_of_victim2)


# Bin the numeric age into five-year bands. The first band is closed on both
# sides ([0, 5]); every later band is open on the left and closed on the right
# ((5, 10], ..., (95, 99]). Ages outside 0-99 and missing ages get "999".
age_breaks <- c(seq(0, 95, by = 5), 99)
age_labels <- c(
  "0-5", "6-10", "11-15", "16-20", "21-25", "26-30", "31-35", "36-40",
  "41-45", "46-50", "51-55", "56-60", "61-65", "66-70", "71-75", "76-80",
  "81-85", "86-90", "91-95", "96-99"
)
df <- df %>%
  mutate(
    agecat = as.character(
      cut(
        age_of_victim2,
        breaks = age_breaks,
        labels = age_labels,
        right = TRUE,
        include.lowest = TRUE
      )
    ),
    agecat = coalesce(agecat, "999")
  )

# Drop observations whose recoded age is the "unknown" code.
df <- df %>%
  filter(age_of_victim != '999')

# Give four variables shorter names (new_name = old_name).
df <- df %>%
  dplyr::rename(
    weapon = type_weapon_force_involved_1,
    circumstance = agg_assault_homicide_circumsta1,
    vic_resident_status = resident_status_of_victim,
    vic_relation = relation_of_vict_to_offender1
  )

# Number of missing values in every column, printed for inspection.
vapply(df, function(column) sum(is.na(column)), integer(1))




# Recode missing incident hours and victim resident statuses as the explicit
# value "unknown".
df$incident_date_hour <- replace(
  df$incident_date_hour,
  is.na(df$incident_date_hour),
  "unknown"
)
df$vic_resident_status <- replace(
  df$vic_resident_status,
  is.na(df$vic_resident_status),
  "unknown"
)

# Expand race_of_victim into one indicator column per category (the original
# column is removed), then drop the indicator columns that are not used.
# TODO: check whether ucr_offense_code is relevant and needs dummies as well.
unused_race_dummies <- c(
  'race_of_victim_american indian/alaskan native',
  'race_of_victim_asian/pacific islander',
  'race_of_victim_unknown',
  'race_of_victim_white'
)
df2 <- df %>%
  dummy_cols(
    select_columns = 'race_of_victim',
    remove_selected_columns = TRUE
  ) %>%
  dplyr::select(-all_of(unused_race_dummies))












